# Load necessary libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(skimr)
library(rstudioapi)
library(inspectdf)
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.2
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(recipes)
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(purrr)
library(graphics)
library(Hmisc)
##
## Attaching package: 'Hmisc'
##
## The following object is masked from 'package:plotly':
##
## subplot
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(glue)
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
##
## Attaching package: 'h2o'
##
## The following objects are masked from 'package:data.table':
##
## hour, month, week, year
##
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
##
## The following objects are masked from 'package:stats':
##
## cor, sd, var
##
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Load the `mpg` fuel-economy dataset that ships with ggplot2.
# NOTE(review): `data` is also the name of utils::data(); calls to that
# function still resolve, but the variable name is easy to misread.
data <- ggplot2::mpg
# Print a skimr summary of every column (type counts, missing values and
# per-column distribution statistics).
data %>% skim()
| Name | Piped data |
| Number of rows | 234 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| manufacturer | 0 | 1 | 4 | 10 | 0 | 15 | 0 |
| model | 0 | 1 | 2 | 22 | 0 | 38 | 0 |
| trans | 0 | 1 | 8 | 10 | 0 | 10 | 0 |
| drv | 0 | 1 | 1 | 1 | 0 | 3 | 0 |
| fl | 0 | 1 | 1 | 1 | 0 | 5 | 0 |
| class | 0 | 1 | 3 | 10 | 0 | 7 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| displ | 0 | 1 | 3.47 | 1.29 | 1.6 | 2.4 | 3.3 | 4.6 | 7 | ▇▆▆▃▁ |
| year | 0 | 1 | 2003.50 | 4.51 | 1999.0 | 1999.0 | 2003.5 | 2008.0 | 2008 | ▇▁▁▁▇ |
| cyl | 0 | 1 | 5.89 | 1.61 | 4.0 | 4.0 | 6.0 | 8.0 | 8 | ▇▁▇▁▇ |
| cty | 0 | 1 | 16.86 | 4.26 | 9.0 | 14.0 | 17.0 | 19.0 | 35 | ▆▇▃▁▁ |
| hwy | 0 | 1 | 23.44 | 5.95 | 12.0 | 18.0 | 24.0 | 27.0 | 44 | ▅▅▇▁▁ |
# Tabulate the count and percentage of missing values for each column
inspect_na(data)
## # A tibble: 11 × 3
## col_name cnt pcnt
## <chr> <int> <dbl>
## 1 manufacturer 0 0
## 2 model 0 0
## 3 displ 0 0
## 4 year 0 0
## 5 cyl 0 0
## 6 trans 0 0
## 7 drv 0 0
## 8 cty 0 0
## 9 hwy 0 0
## 10 fl 0 0
## 11 class 0 0
# List every column, then keep the names of the numeric ones
colnames(data)
## [1] "manufacturer" "model" "displ" "year" "cyl"
## [6] "trans" "drv" "cty" "hwy" "fl"
## [11] "class"
num_vars <- names(data)[vapply(data, is.numeric, logical(1))]
num_vars
## [1] "displ" "year" "cyl" "cty" "hwy"
# For each numeric column, draw a boxplot and print the values it flags as
# outliers; columns without outliers print nothing.
for (var_name in num_vars) {
  OutVals <- boxplot(data[[var_name]])$out
  if (length(OutVals) == 0) {
    next
  }
  print(paste0("----", var_name))
  print(OutVals)
}
## [1] "----cty"
## [1] 28 28 33 35 29
## [1] "----hwy"
## [1] 44 44 41
# Replace outliers in 'cty' with quartile values:
#   outliers above the median -> 75th percentile of cty
#   outliers below the median -> 25th percentile of cty
OutVals <- boxplot(data[["cty"]])$out
# Keep the median under its own name so the base function `median` is not
# shadowed by a numeric variable.
cty_median <- stats::median(data[["cty"]])
# Plain logical subsetting yields the same value sets as the former
# ifelse() -> na.omit() -> as.matrix() -> t() -> [1, ] round trip.
o3 <- OutVals[OutVals > cty_median]
o1 <- OutVals[OutVals < cty_median]
# Drop incomplete rows before computing the replacement quantiles
data <- na.omit(data)
val75 <- quantile(data[["cty"]], 0.75)
val25 <- quantile(data[["cty"]], 0.25)
# Rows whose cty equals one of the flagged values are overwritten in place
data[which(data[["cty"]] %in% o3), "cty"] <- val75
data[which(data[["cty"]] %in% o1), "cty"] <- val25
# Re-draw the boxplot to confirm the flagged values are gone
boxplot(data[["cty"]])
# Prepare data for modeling
names(data)
## [1] "manufacturer" "model" "displ" "year" "cyl"
## [6] "trans" "drv" "cty" "hwy" "fl"
## [11] "class"
# Response column and candidate predictors. `features` must hold column
# NAMES: pasting a data frame into a formula deparses every column into a
# literal `c(...)` vector, which is what previously produced unreadable
# coefficient labels.
target <- "cty"
features <- data %>% select(c("year", "cyl", "displ")) %>% names()
# Fit a baseline Gaussian linear model with stats::glm() (not H2O):
# cty ~ year + cyl + displ
f <- reformulate(features, response = target)
glm <- glm(f, data = data)
glm %>% summary()
##
## Call:
## glm(formula = f, data = data)
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -172.06798   56.82357  -3.028 0.002742 **
## year           0.09952    0.02838   3.507 0.000545 ***
## cyl           -1.20798    0.21401  -5.644 4.85e-08 ***
## displ         -1.03653    0.26789  -3.869 0.000142 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 3.725912)
##
## Null deviance: 3243.61 on 233 degrees of freedom
## Residual deviance: 856.96 on 230 degrees of freedom
## AIC: 977.81
##
## Number of Fisher Scoring iterations: 2
# Check and handle multicollinearity using VIF
library(faraway)
##
## Attaching package: 'faraway'
## The following object is masked from 'package:lattice':
##
## melanoma
## The following object is masked from 'package:mice':
##
## mammalsleep
# Iteratively drop the predictor with the largest variance inflation factor
# and refit, until the largest remaining VIF is below the threshold.
vif_threshold <- 1.5
repeat {
  # Compute the VIFs once per iteration and reuse them for test and drop
  vif_sorted <- sort(faraway::vif(glm), decreasing = TRUE)
  if (vif_sorted[1] < vif_threshold) {
    break
  }
  # Never drop the last predictor: that would build the invalid formula
  # "cty ~ " and abort the script.
  if (length(vif_sorted) < 2) {
    break
  }
  afterVIF <- names(vif_sorted)[-1]
  f <- as.formula(paste(target, paste(afterVIF, collapse = " + "), sep = " ~ "))
  glm <- glm(f, data = data)
}
# Store the surviving predictors, ordered by decreasing VIF
features <- names(sort(faraway::vif(glm), decreasing = TRUE))
# Keep only the target (first) and the three numeric predictors, then preview
data <- dplyr::select(data, cty, year, cyl, displ)
glimpse(data)
## Rows: 234
## Columns: 4
## $ cty <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 15, 15, …
## $ year <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 2008, 2008…
## $ cyl <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8…
## $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.8, 2.8,…
# Centre and scale every column except the first one (the target `cty`)
data[-1] <- as.data.frame(scale(data[-1]))
# Connect R to the H2O cluster
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 41 minutes 11 seconds
## H2O cluster timezone: Asia/Baku
## H2O data parsing timezone: UTC
## H2O cluster version: 3.42.0.2
## H2O cluster version age: 5 months and 3 days
## H2O cluster name: H2O_started_from_R_ACER_osn291
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.77 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.1 (2023-06-16 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (5 months and 3 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Upload the prepared data (target plus scaled predictors) to the H2O
# cluster as an H2O frame; the splitting and modelling steps that follow
# operate on this frame.
h2o_data <- data %>% as.h2o()
##
|
| | 0%
|
|======================================================================| 100%
# Split the H2O frame 80/20 with a fixed seed; `h2o_data` becomes a list
# whose first element is the training part and second the testing part.
h2o_data <- h2o.splitFrame(h2o_data, ratios = 0.8, seed = 123)
train <- h2o_data[[1]]
test <- h2o_data[[2]]
# Response column and predictor columns
target <- "cty"
features <- names(data[, c("year", "cyl", "displ")])
# Fit an unregularised (lambda = 0) H2O GLM with 10-fold cross-validation,
# scoring against the held-out part and requesting coefficient p-values.
model <- h2o.glm(
  y = target,
  x = features,
  training_frame = train,
  validation_frame = test,
  nfolds = 10,
  lambda = 0,
  compute_p_values = TRUE,
  seed = 123
)
##
|
| | 0%
|
|======================================================================| 100%
# Print the full model report: training, validation and cross-validation
# metrics, the scoring history and the variable importances.
summary(model)
## Model Details:
## ==============
##
## H2ORegressionModel: glm
## Model Key: GLM_model_R_1703759888795_16
## GLM Model: summary
## family link regularization number_of_predictors_total
## 1 gaussian identity None 3
## number_of_active_predictors number_of_iterations training_frame
## 1 3 1 RTMP_sid_82fc_3
##
## H2ORegressionMetrics: glm
## ** Reported on training data. **
##
## MSE: 3.43623
## RMSE: 1.853707
## MAE: 1.428847
## RMSLE: 0.1110743
## Mean Residual Deviance : 3.43623
## R^2 : 0.7391495
## Null Deviance :2423.864
## Null D.o.F. :183
## Residual Deviance :632.2662
## Residual D.o.F. :180
## AIC :759.2943
##
##
## H2ORegressionMetrics: glm
## ** Reported on validation data. **
##
## MSE: 4.630496
## RMSE: 2.151859
## MAE: 1.678881
## RMSLE: 0.1379459
## Mean Residual Deviance : 4.630496
## R^2 : 0.7154596
## Null Deviance :821.3956
## Null D.o.F. :49
## Residual Deviance :231.5248
## Residual D.o.F. :46
## AIC :228.5271
##
##
## H2ORegressionMetrics: glm
## ** Reported on cross-validation data. **
## ** 10-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 3.639883
## RMSE: 1.907848
## MAE: 1.463574
## RMSLE: 0.114575
## Mean Residual Deviance : 3.639883
## R^2 : 0.7236898
## Null Deviance :2476.279
## Null D.o.F. :183
## Residual Deviance :669.7385
## Residual D.o.F. :180
## AIC :769.8885
##
##
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## mae 1.447927 0.303751 1.552973 1.559963 1.142328
## mean_residual_deviance 3.530423 1.323507 3.621001 4.204665 2.452665
## mse 3.530423 1.323507 3.621001 4.204665 2.452665
## null_deviance 247.627900 96.894516 239.961460 306.785000 241.668290
## r2 0.674627 0.161133 0.758537 0.634322 0.818344
## residual_deviance 66.973850 32.842900 57.936016 100.911970 41.695300
## rmse 1.845679 0.371024 1.902893 2.050528 1.566099
## rmsle 0.108994 0.023277 0.101542 0.144045 0.077322
## cv_4_valid cv_5_valid cv_6_valid cv_7_valid cv_8_valid
## mae 0.880118 1.857747 1.815711 1.306672 1.337201
## mean_residual_deviance 1.180759 6.065352 4.356180 2.728392 3.317573
## mse 1.180759 6.065352 4.356180 2.728392 3.317573
## null_deviance 241.559570 411.962200 136.014180 73.008820 309.713260
## r2 0.861630 0.607683 0.487508 0.341815 0.718680
## residual_deviance 17.711380 127.372380 69.698880 46.382668 86.256905
## rmse 1.086627 2.462793 2.087146 1.651784 1.821421
## rmsle 0.077603 0.137223 0.123147 0.104629 0.123642
## cv_9_valid cv_10_valid
## mae 1.666621 1.359937
## mean_residual_deviance 4.310615 3.067023
## mse 4.310615 3.067023
## null_deviance 316.324000 199.282270
## r2 0.720221 0.797529
## residual_deviance 81.901700 39.871300
## rmse 2.076202 1.751292
## rmsle 0.110992 0.089790
##
## Scoring History:
## timestamp duration iterations negative_log_likelihood objective
## 1 2023-12-28 15:19:29 0.000 sec 0 2423.86413 13.17317
## 2 2023-12-28 15:19:29 0.003 sec 1 NA NA
## training_rmse training_deviance training_mae training_r2 validation_rmse
## 1 NA NA NA NA NA
## 2 1.85371 3.43623 1.42885 0.73915 2.15186
## validation_deviance validation_mae validation_r2
## 1 NA NA NA
## 2 4.63050 1.67888 0.71546
##
## Variable Importances: (Extract with `h2o.varimp`)
## =================================================
##
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 displ 1.703743 1.000000 0.462959
## 2 cyl 1.509642 0.886074 0.410216
## 3 year 0.466729 0.273943 0.126825
# Show each predictor's p-value (rounded to 3 decimals), largest first.
# The first row of the coefficients table is dropped before sorting.
model@model$coefficients_table %>%
  as.data.frame() %>%
  dplyr::transmute(names, p_value = round(p_value, 3)) %>%
  dplyr::slice(-1) %>%
  dplyr::arrange(dplyr::desc(p_value))
## names p_value
## 1 year 0.001
## 2 cyl 0.000
## 3 displ 0.000
# Score the held-out frame and pull the predictions back into R
y_pred <- as.data.frame(h2o.predict(model, newdata = test))
##
|
| | 0%
|
|======================================================================| 100%
# Print the predicted cty values, one per test-set row
y_pred$predict
## [1] 20.22789 20.89541 17.53890 10.48787 19.43621 16.34359 17.27501 16.74722
## [9] 13.52266 13.52266 13.52266 11.93151 13.52266 13.52266 13.52266 16.35138
## [17] 12.72319 13.65460 21.15930 21.15930 20.89541 20.36762 20.89541 18.06669
## [25] 18.06669 11.67539 13.51488 11.66761 20.23567 16.35138 19.30426 19.30426
## [33] 20.23567 19.70010 20.23567 19.70010 19.70010 20.36762 16.73944 17.01111
## [41] 19.70010 16.73944 16.73944 12.59124 16.21165 19.96400 19.96400 19.96400
## [49] 20.22789 20.89541
# Test-set residuals and root mean squared error
test_set <- as.data.frame(test)
residuals <- test_set$cty - y_pred$predict
RMSE <- sqrt(mean(residuals^2))
# R-squared: 1 - residual sum of squares / total sum of squares
y_test_mean <- mean(test_set$cty)
rss <- sum(residuals^2)
tss <- sum((test_set$cty - y_test_mean)^2)
R2 <- 1 - (rss / tss)
# Adjusted R-squared for n observations and k predictors
n <- nrow(test_set)
k <- length(features)
Adjusted_R2 <- 1 - (1 - R2) * ((n - 1) / (n - k - 1))
# Show the three metrics as a one-row table
tibble(
  RMSE = round(RMSE, 1),
  R2,
  Adjusted_R2
)
## # A tibble: 1 × 3
## RMSE R2 Adjusted_R2
## <dbl> <dbl> <dbl>
## 1 2.2 0.715 0.697
# Pair every test-set prediction with its observed value
my_data <- data.frame(
  predicted = y_pred$predict,
  observed = test_set$cty
)
# Observed vs. predicted scatter plot with a linear fit. The target is `cty`
# (city miles per gallon), so the axes are labelled accordingly, and the
# title interpolates the numeric Adjusted_R2 directly (no enexpr() needed).
g <- ggplot(my_data, aes(predicted, observed)) +
  geom_point(color = "red") +
  geom_smooth(method = lm) +
  labs(
    x = "Predicted city mileage (cty, mpg)",
    y = "Observed city mileage (cty, mpg)",
    title = glue("Test: Adjusted R2 = {round(Adjusted_R2, 2)}")
  ) +
  theme(
    plot.title = element_text(color = "darkgreen", size = 16, hjust = 0.5),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )
# Convert ggplot to plotly for interactive visualization
g %>% ggplotly()
## `geom_smooth()` using formula = 'y ~ x'
# Score the training frame and pull the predictions back into R
y_pred_train <- as.data.frame(h2o.predict(model, newdata = train))
##
|
| | 0%
|
|======================================================================| 100%
# Training-set residuals and root mean squared error
train_set <- as.data.frame(train)
residuals_train <- train_set$cty - y_pred_train$predict
RMSE_train <- sqrt(mean(residuals_train^2))
# R-squared: 1 - residual sum of squares / total sum of squares
y_train_mean <- mean(train_set$cty)
rss_train <- sum(residuals_train^2)
tss_train <- sum((train_set$cty - y_train_mean)^2)
R2_train <- 1 - (rss_train / tss_train)
# Adjusted R-squared for n_train observations and k_train predictors
n_train <- nrow(train_set)
k_train <- length(features)
Adjusted_R2_train <- 1 - (1 - R2_train) * ((n_train - 1) / (n_train - k_train - 1))
# Pair every training-set prediction with its observed value
my_data_train <- data.frame(
  predicted = y_pred_train$predict,
  observed = train_set$cty
)
# Observed vs. predicted scatter plot with a linear fit. The target is `cty`
# (city miles per gallon), so the axes are labelled accordingly, and the
# title interpolates the numeric Adjusted_R2_train directly (no enexpr()).
g_train <- ggplot(my_data_train, aes(predicted, observed)) +
  geom_point(color = "darkred") +
  geom_smooth(method = lm) +
  labs(
    x = "Predicted city mileage (cty, mpg)",
    y = "Observed city mileage (cty, mpg)",
    title = glue("Train: Adjusted R2 = {round(Adjusted_R2_train, 2)}")
  ) +
  theme(
    plot.title = element_text(color = "darkgreen", size = 16, hjust = 0.5),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )
# Convert ggplot to plotly for interactive visualization
g_train %>% ggplotly()
## `geom_smooth()` using formula = 'y ~ x'
# Show the training and test plots together in a single figure.
# patchwork is attached here because its `+` method is what combines the two
# ggplot objects below.
library(patchwork)
g_train + g
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Side-by-side comparison of train and test error and adjusted R-squared
tibble(
  RMSE_train = round(RMSE_train, 1),
  RMSE_test = round(RMSE, 1),
  Adjusted_R2_train = Adjusted_R2_train,
  Adjusted_R2_test = Adjusted_R2
)
## # A tibble: 1 × 4
## RMSE_train RMSE_test Adjusted_R2_train Adjusted_R2_test
## <dbl> <dbl> <dbl> <dbl>
## 1 1.9 2.2 0.735 0.697